Tutorial for the WGCNA CRAN package (https://CRAN.R-project.org/package=WGCNA) adapted from ‘Tutorials for the WGCNA package’ (https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/index.html).
library(WGCNA)
## Loading required package: dynamicTreeCut
## Loading required package: fastcluster
##
## Attaching package: 'fastcluster'
## The following object is masked from 'package:stats':
##
## hclust
##
##
## Attaching package: 'WGCNA'
## The following object is masked from 'package:stats':
##
## cor
# Here are input parameters of the simulation model
# number of samples or microarrays in the training data
no.obs=50
# now we specify the true measures of eigengene significance
# recall that ESturquoise=cor(y,MEturquoise)
ESturquoise=0; ESbrown= -.6;
ESgreen=.6;ESyellow=0
# Note that we don’t specify the eigengene significance of the blue module
# since it is highly correlated with the turquoise module.
ESvector=c(ESturquoise,ESbrown,ESgreen,ESyellow)
# number of genes
nGenes1=3000
# proportion of genes in the turquoise, blue, brown, green, and yellow module #respectively.
simulateProportions1=c(0.2,0.15, 0.08, 0.06, 0.04)
# Note that the proportions don’t add up to 1. The remaining genes will be colored grey,
# ie the grey genes are non-module genes.
# set the seed of the random number generator. As a homework exercise change this seed.
set.seed(1)
#Step 1: simulate a module eigengene network.
# Training Data Set I
MEgreen=rnorm(no.obs)
scaledy=MEgreen*ESgreen+sqrt(1-ESgreen^2)*rnorm(no.obs)
y=ifelse( scaledy>median(scaledy),2,1)
MEturquoise= ESturquoise*scaledy+sqrt(1-ESturquoise^2)*rnorm(no.obs)
# we simulate a strong dependence between MEblue and MEturquoise
MEblue= .6*MEturquoise+ sqrt(1-.6^2) *rnorm(no.obs)
MEbrown= ESbrown*scaledy+sqrt(1-ESbrown^2)*rnorm(no.obs)
MEyellow= ESyellow*scaledy+sqrt(1-ESyellow^2)*rnorm(no.obs)
ModuleEigengeneNetwork1=data.frame(y,MEturquoise,MEblue,MEbrown,MEgreen, MEyellow)
ModuleEigengeneNetwork1[1:3, ]
## y MEturquoise MEblue MEbrown MEgreen MEyellow
## 1 1 -0.62036668 -0.01207033 0.361954 -0.6264538 0.13622189
## 2 1 0.04211587 0.01042166 1.578760 0.1836433 0.40716760
## 3 1 -0.91092165 -0.80100769 1.406360 -0.8356286 -0.06965481
nrow(ModuleEigengeneNetwork1)
## [1] 50
dat1=simulateDatExpr5Modules(MEturquoise=ModuleEigengeneNetwork1$MEturquoise,
MEblue=ModuleEigengeneNetwork1$MEblue,
MEbrown=ModuleEigengeneNetwork1$MEbrown,
MEyellow=ModuleEigengeneNetwork1$MEyellow,
MEgreen=ModuleEigengeneNetwork1$MEgreen,
nGenes=nGenes1,
simulateProportions=simulateProportions1)
class(dat1)
## [1] "list"
str(dat1)
## List of 3
## $ datExpr : num [1:50, 1:3000] -0.769 0.201 -0.893 0.286 -0.622 ...
## $ truemodule: chr [1:3000] "turquoise" "turquoise" "turquoise" "turquoise" ...
## $ datME :'data.frame': 50 obs. of 5 variables:
## ..$ MEturquoise: num [1:50] -0.6204 0.0421 -0.9109 0.158 -0.6546 ...
## ..$ MEblue : num [1:50] -0.0121 0.0104 -0.801 -0.6487 -1.5827 ...
## ..$ MEbrown : num [1:50] 0.362 1.579 1.406 -0.297 -2.635 ...
## ..$ MEyellow : num [1:50] 0.1362 0.4072 -0.0697 -0.2477 0.6956 ...
## ..$ MEgreen : num [1:50] -0.626 0.184 -0.836 1.595 0.33 ...
dat1$datME[1:3, ]
## MEturquoise MEblue MEbrown MEyellow MEgreen
## 1 -0.62036668 -0.01207033 0.361954 0.13622189 -0.6264538
## 2 0.04211587 0.01042166 1.578760 0.40716760 0.1836433
## 3 -0.91092165 -0.80100769 1.406360 -0.06965481 -0.8356286
dat1$datExpr[1:5, 1:3]
## [,1] [,2] [,3]
## [1,] -0.7690318 -0.25281902 0.33073345
## [2,] 0.2010896 0.42931409 -0.27419755
## [3,] -0.8929247 -1.10073581 0.70294278
## [4,] 0.2861690 0.05983439 0.02233195
## [5,] -0.6224324 -0.61522751 0.80939447
datExpr = dat1$datExpr
truemodule = dat1$truemodule
datME = dat1$datME
# attach(ModuleEigengeneNetwork1)
table(truemodule)
## truemodule
## blue brown green grey turquoise yellow
## 450 240 120 1410 600 180
dim(datExpr)
## [1] 50 3000
datExpr=data.frame(datExpr)
ArrayName=paste("Sample",1:dim(datExpr)[[1]], sep="" )
# The following code is useful for outputting the simulated data
GeneName=paste("Gene",1:dim(datExpr)[[2]], sep="" )
dimnames(datExpr)[[1]]=ArrayName
dimnames(datExpr)[[2]]=GeneName
# Load WGCNA package
library(WGCNA)
datGeneSummary=read.csv("GeneSummaryTutorial.csv")
datTraits=read.csv("TraitsTutorial.csv")
datMicroarrays=read.csv("MicroarrayDataTutorial.csv")
#=====================================================================================
#
# Code chunk 3
#
#=====================================================================================
# This vector contains the microarray sample names
ArrayName= names(data.frame(datMicroarrays[,-1]))
# This vector contains the gene names
GeneName= datMicroarrays$GeneName
# We transpose the data so that the rows correspond to samples and the columns correspond to genes
# Since the first column contains the gene names, we exclude it.
datExpr=data.frame(t(datMicroarrays[,-1]))
names(datExpr)=datMicroarrays[,1]
dimnames(datExpr)[[1]]=names(data.frame(datMicroarrays[,-1]))
#Also, since we simulated the data, we know the true module color:
truemodule= datGeneSummary$truemodule
rm(datMicroarrays)
collectGarbage()
#=====================================================================================
#
# Code chunk 4
#
#=====================================================================================
# First, make sure that the array names in the file datTraits line up with those in the microarray data
table( dimnames(datExpr)[[1]]==datTraits$ArrayName)
# Next, define the microarray sample trait
y=datTraits$y
# Load WGCNA package
library(WGCNA)
# The following setting is important, do not omit.
#options(stringsAsFactors = FALSE);
# Load the previously saved data
# load("Simulated-dataSimulation.RData");
# attach(ModuleEigengeneNetwork1)
meanExpressionByArray=apply( datExpr,1,mean, na.rm=T)
NumberMissingByArray=apply( is.na(data.frame(datExpr)),1, sum)
# sizeGrWindow(9, 5)
barplot(meanExpressionByArray,
xlab = "Sample", ylab = "Mean expression",
main ="Mean expression across samples",
names.arg = c(1:50), cex.names = 0.7)
# Keep only arrays containing less than 500 missing entries
KeepArray= NumberMissingByArray<500
table(KeepArray)
## KeepArray
## TRUE
## 50
datExpr=datExpr[KeepArray,]
y=y[KeepArray]
#ArrayName[KeepArray]
NumberMissingByGene =apply( is.na(data.frame(datExpr)),2, sum)
# One could do a barplot(NumberMissingByGene), but the barplot is empty in this case.
# It may be better to look at the numbers of missing samples using the summary method:
summary(NumberMissingByGene)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
# Calculate the variances of the probes and the number of present entries
variancedatExpr=as.vector(apply(as.matrix(datExpr),2,var, na.rm=T))
no.presentdatExpr=as.vector(apply(!is.na(as.matrix(datExpr)),2, sum) )
# Another way of summarizing the number of pressent entries
table(no.presentdatExpr)
## no.presentdatExpr
## 50
## 3000
# Keep only genes whose variance is non-zero and have at least 4 present entries
KeepGenes= variancedatExpr>0 & no.presentdatExpr>=4
table(KeepGenes)
## KeepGenes
## TRUE
## 3000
datExpr=datExpr[, KeepGenes]
#GeneName=GeneName[KeepGenes]
# sizeGrWindow(9, 5)
plotClusterTreeSamples(datExpr=datExpr, y=y)
# Load WGCNA package
library(WGCNA)
GS1= as.numeric(cor(y, datExpr, use="p"))
# Network terminology: GS1 will be referred to as signed gene significance measure
p.Standard=corPvalueFisher(GS1, nSamples =length(y) )
# since the q-value function has problems with missing data, we use the following trick
p.Standard2=p.Standard
p.Standard2[is.na(p.Standard)]=1
q.Standard=qvalue(p.Standard2)$qvalues
# Form a data frame to hold the results
StandardGeneScreeningResults=data.frame(GeneName,PearsonCorrelation=GS1, p.Standard, q.Standard)
NoiseGeneIndicator=is.element( truemodule, c("turquoise", "blue", "yellow", "grey"))+.0
SignalGeneIndicator=1-NoiseGeneIndicator
table(q.Standard<.20)
##
## FALSE TRUE
## 2997 3
mean(NoiseGeneIndicator[q.Standard<=0.20])
## [1] 0.6666667
#=====================================================================================
#
# Code chunk 6
#
#=====================================================================================
# save.image(file = "Simulated-StandardScreening.RData")
# Load WGCNA package
library(WGCNA)
# Load additional necessary packages
library(cluster)
# here we define the adjacency matrix using soft thresholding with beta=6
ADJ1=abs(cor(datExpr,use="p"))^6
# When you have relatively few genes (<5000) use the following code
k=as.vector(apply(ADJ1,2,sum, na.rm=T))
# When you have a lot of genes use the following code
k=softConnectivity(datE=datExpr,power=6)
## softConnectivity: FYI: connecitivty of genes with less than 17 valid samples will be returned as NA.
## ..calculating connectivities..
# Plot a histogram of k and a scale free topology plot
#sizeGrWindow(10,5)
par(mfrow=c(1,2))
hist(k)
scaleFreePlot(k, main="Check scale free topology\n")
## scaleFreeRsquared slope
## 1 0.91 -2.08
datExpr=datExpr[, rank(-k,ties.method="first" )<=3600]
# Turn adjacency into a measure of dissimilarity
dissADJ=1-ADJ1
dissTOM=TOMdist(ADJ1)
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
collectGarbage()
pam4=pam(as.dist(dissADJ), 4)
pam5=pam(as.dist(dissADJ), 5)
pam6=pam(as.dist(dissADJ), 6)
# Cross-tabulte the detected and the true (simulated) module membership:
table(pam4$clustering, truemodule)
## truemodule
## blue brown green grey turquoise yellow
## 1 51 8 17 792 530 1
## 2 380 10 34 261 37 6
## 3 12 4 17 167 17 172
## 4 7 218 52 190 16 1
table(pam5$clustering, truemodule)
## truemodule
## blue brown green grey turquoise yellow
## 1 50 5 2 742 524 1
## 2 373 10 3 234 35 6
## 3 11 3 2 142 15 170
## 4 9 7 107 133 16 2
## 5 7 215 6 159 10 1
table(pam6$clustering, truemodule)
## truemodule
## blue brown green grey turquoise yellow
## 1 39 3 1 440 310 1
## 2 21 5 3 412 235 1
## 3 363 10 3 186 26 6
## 4 11 3 2 119 8 169
## 5 9 6 106 119 14 2
## 6 7 213 5 134 7 1
pamTOM4=pam(as.dist(dissTOM), 4)
pamTOM5=pam(as.dist(dissTOM), 5)
pamTOM6=pam(as.dist(dissTOM), 6)
# Cross-tabulte the detected and the true (simulated) module membership:
table(pamTOM4$clustering, truemodule)
## truemodule
## blue brown green grey turquoise yellow
## 1 58 15 34 1100 576 10
## 2 384 11 48 197 21 9
## 3 3 2 4 42 2 159
## 4 5 212 34 71 1 2
table(pamTOM5$clustering, truemodule)
## truemodule
## blue brown green grey turquoise yellow
## 1 58 13 9 1087 574 10
## 2 384 10 7 190 20 9
## 3 0 4 97 28 3 0
## 4 3 2 0 40 2 159
## 5 5 211 7 65 1 2
table(pamTOM6$clustering, truemodule)
## truemodule
## blue brown green grey turquoise yellow
## 1 49 11 7 808 475 8
## 2 9 2 2 285 99 2
## 3 384 10 7 185 20 9
## 4 0 4 97 28 3 0
## 5 3 2 0 40 2 159
## 6 5 211 7 64 1 2
hierADJ=hclust(as.dist(dissADJ), method="average" )
# Plot the resulting clustering tree together with the true color assignment
#sizeGrWindow(10,5);
plotDendroAndColors(hierADJ, colors = data.frame(truemodule), dendroLabels = FALSE, hang = 0.03,
main = "Gene hierarchical clustering dendrogram and simulated module colors" )
colorStaticADJ=as.character(cutreeStaticColor(hierADJ, cutHeight=.99, minSize=20))
# Plot the dendrogram with module colors
# sizeGrWindow(10,5);
plotDendroAndColors(hierADJ, colors = data.frame(truemodule, colorStaticADJ),
dendroLabels = FALSE, abHeight = 0.99,
main = "Gene dendrogram and module colors")
branch.number=cutreeDynamic(hierADJ,method="tree")
# This function transforms the branch numbers into colors
colorDynamicADJ=labels2colors(branch.number )
colorDynamicHybridADJ=labels2colors(cutreeDynamic(hierADJ,distM= dissADJ,
cutHeight = 0.998, deepSplit=2, pamRespectsDendro = FALSE))
## ..done.
# Plot results of all module detection methods together:
# sizeGrWindow(10,5)
plotDendroAndColors(dendro = hierADJ,
colors=data.frame(truemodule, colorStaticADJ,
colorDynamicADJ, colorDynamicHybridADJ),
dendroLabels = FALSE, marAll = c(0.2, 8, 2.7, 0.2),
main = "Gene dendrogram and module colors")
# Calculate the dendrogram
hierTOM = hclust(as.dist(dissTOM),method="average");
# The reader should vary the height cut-off parameter h1
# (related to the y-axis of dendrogram) in the following
colorStaticTOM = as.character(cutreeStaticColor(hierTOM, cutHeight=.99, minSize=20))
colorDynamicTOM = labels2colors (cutreeDynamic(hierTOM,method="tree"))
colorDynamicHybridTOM = labels2colors(cutreeDynamic(hierTOM, distM= dissTOM , cutHeight = 0.998,
deepSplit=2, pamRespectsDendro = FALSE))
## ..done.
# Now we plot the results
# sizeGrWindow(10,5)
plotDendroAndColors(hierTOM,
colors=data.frame(truemodule, colorStaticTOM,
colorDynamicTOM, colorDynamicHybridTOM),
dendroLabels = FALSE, marAll = c(1, 8, 3, 1),
main = "Gene dendrogram and module colors, TOM dissimilarity")
tabStaticADJ=table(colorStaticADJ,truemodule)
tabStaticTOM=table(colorStaticTOM,truemodule)
tabDynamicADJ=table(colorDynamicADJ, truemodule)
tabDynamicTOM=table(colorDynamicTOM,truemodule)
tabDynamicHybridADJ =table(colorDynamicHybridADJ,truemodule)
tabDynamicHybridTOM =table(colorDynamicHybridTOM,truemodule)
randIndex(tabStaticADJ,adjust=F)
## [1] 0.5072126
randIndex(tabStaticTOM,adjust=F)
## [1] 0.5997922
randIndex(tabDynamicADJ,adjust=F)
## [1] 0.5033845
randIndex(tabDynamicTOM,adjust=F)
## [1] 0.5980453
randIndex(tabDynamicHybridADJ ,adjust=F)
## [1] 0.7054071
randIndex(tabDynamicHybridTOM ,adjust=F)
## [1] 0.7039642
colorh1= colorDynamicHybridTOM
# remove the dissimilarities, adjacency matrices etc to free up space
#rm(ADJ1); rm(dissADJ);
# collectGarbage()
# save.image("Simulated-NetworkConstruction.RData")
# Load WGCNA package
library(WGCNA)
datME=moduleEigengenes(datExpr,colorh1)$eigengenes
signif(cor(datME, use="p"), 2)
## MEblue MEbrown MEgreen MEgrey MEturquoise MEyellow
## MEblue 1.000 0.072 -0.2100 0.077 0.6700 -0.110
## MEbrown 0.072 1.000 -0.2800 -0.140 0.1200 0.079
## MEgreen -0.210 -0.280 1.0000 0.086 0.0092 -0.096
## MEgrey 0.077 -0.140 0.0860 1.000 0.2700 0.025
## MEturquoise 0.670 0.120 0.0092 0.270 1.0000 0.099
## MEyellow -0.110 0.079 -0.0960 0.025 0.0990 1.000
dissimME=(1-t(cor(datME, method="p")))/2
hclustdatME=hclust(as.dist(dissimME), method="average" )
# Plot the eigengene dendrogram
par(mfrow=c(1,1))
plot(hclustdatME, main="Clustering tree based of the module eigengenes")
# sizeGrWindow(8,9)
plotMEpairs(datME,y=y)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
signif(cor(datME, ModuleEigengeneNetwork1[,-1]),2)
## MEturquoise MEblue MEbrown MEgreen MEyellow
## MEblue 0.680 1.000 0.066 -0.190 -0.1100
## MEbrown 0.100 0.071 0.990 -0.280 0.0840
## MEgreen 0.015 -0.210 -0.270 0.990 -0.1100
## MEgrey 0.270 0.077 -0.140 0.047 0.0083
## MEturquoise 1.000 0.660 0.110 0.021 0.1200
## MEyellow 0.099 -0.120 0.110 -0.110 0.9900
# sizeGrWindow(8,9)
par(mfrow=c(3,1), mar=c(1, 2, 4, 1))
which.module="turquoise";
plotMat(t(scale(datExpr[,colorh1==which.module ]) ),nrgcols=30,rlabels=T,
clabels=T,rcols=which.module,
title=which.module )
# for the second (blue) module we use
which.module="blue";
plotMat(t(scale(datExpr[,colorh1==which.module ]) ),nrgcols=30,rlabels=T,
clabels=T,rcols=which.module,
title=which.module )
which.module="brown";
plotMat(t(scale(datExpr[,colorh1==which.module ]) ),nrgcols=30,rlabels=T,
clabels=T,rcols=which.module,
title=which.module )
# sizeGrWindow(8,7);
which.module="green"
ME=datME[, paste("ME",which.module, sep="")]
par(mfrow=c(2,1), mar=c(0.3, 5.5, 3, 2))
plotMat(t(scale(datExpr[,colorh1==which.module ]) ),
nrgcols=30,rlabels=F,rcols=which.module,
main=which.module, cex.main=2)
par(mar=c(5, 4.2, 0, 0.7))
barplot(ME, col=which.module, main="", cex.main=2,
ylab="eigengene expression",xlab="array sample")
signif(cor(y,datME, use="p"),2)
## MEblue MEbrown MEgreen MEgrey MEturquoise MEyellow
## [1,] -0.1 -0.28 0.42 -0.18 0.056 0.099
cor.test(y, datME$MEbrown)
##
## Pearson's product-moment correlation
##
## data: y and datME$MEbrown
## t = -1.9826, df = 48, p-value = 0.05315
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.514098581 0.003495354
## sample estimates:
## cor
## -0.27512
p.values = corPvalueStudent(cor(y,datME, use="p"), nSamples = length(y))
GS1=as.numeric(cor(y,datExpr, use="p"))
GeneSignificance=abs(GS1)
# Next module significance is defined as average gene significance.
ModuleSignificance=tapply(GeneSignificance, colorh1, mean, na.rm=T)
# sizeGrWindow(8,7)
par(mfrow = c(1,1))
plotModuleSignificance(GeneSignificance,colorh1)
# collectGarbage()
# save.image("Simulated-RelatingToExt.RData")
# Load WGCNA package
library(WGCNA)
library(cluster)
# Load the previously saved data
# load("Simulated-RelatingToExt.RData");
ADJ1=abs(cor(datExpr,use="p"))^6
Alldegrees1=intramodularConnectivity(ADJ1, colorh1)
head(Alldegrees1)
## kTotal kWithin kOut kDiff
## Gene1 31.80186 28.37595 3.425906 24.95005
## Gene2 28.88249 26.47896 2.403522 24.07544
## Gene3 25.38600 23.11852 2.267486 20.85103
## Gene4 24.01574 22.12962 1.886122 20.24350
## Gene5 24.93663 21.69175 3.244881 18.44687
## Gene6 25.91260 23.92613 1.986469 21.93966
colorlevels=unique(colorh1)
# sizeGrWindow(9,6)
par(mfrow=c(2,as.integer(0.5+length(colorlevels)/2)))
par(mar = c(4,5,3,1))
for (i in c(1:length(colorlevels)))
{
whichmodule=colorlevels[[i]];
restrict1 = (colorh1==whichmodule);
verboseScatterplot(Alldegrees1$kWithin[restrict1],
GeneSignificance[restrict1], col=colorh1[restrict1],
main=whichmodule,
xlab = "Connectivity", ylab = "Gene Significance", abline = TRUE)
}
datKME=signedKME(datExpr, datME, outputColumnName="MM.")
# Display the first few rows of the data frame
head(datKME)
## MM.blue MM.brown MM.green MM.grey MM.turquoise MM.yellow
## Gene1 0.6830511 0.11547756 -0.007124794 0.2840109 0.9481457 0.09588170
## Gene2 0.6342657 0.02257975 0.080277091 0.3029967 0.9356343 0.06889483
## Gene3 -0.6198067 -0.12531203 0.008372054 -0.2776929 -0.9121710 -0.17852211
## Gene4 0.5966736 0.06469079 0.049862112 0.2671967 0.9052030 0.11707603
## Gene5 0.6642214 0.14369720 -0.017975774 0.2442237 0.9017972 -0.01038067
## Gene6 -0.6018161 -0.15167072 0.006667131 -0.2053897 -0.9192597 -0.17138960
FilterGenes= abs(GS1)> .2 & abs(datKME$MM.brown)>.8
table(FilterGenes)
## FilterGenes
## FALSE TRUE
## 2989 11
dimnames(data.frame(datExpr))[[2]][FilterGenes]
## [1] "Gene1051" "Gene1052" "Gene1053" "Gene1054" "Gene1056" "Gene1057"
## [7] "Gene1059" "Gene1060" "Gene1061" "Gene1063" "Gene1075"
# sizeGrWindow(8,6)
par(mfrow=c(2,2))
# We choose 4 modules to plot: turquoise, blue, brown, green.
# For simplicity we write the code out explicitly for each module.
which.color="turquoise";
restrictGenes=colorh1==which.color
verboseScatterplot(Alldegrees1$kWithin[ restrictGenes],
(datKME[restrictGenes, paste("MM.", which.color, sep="")])^6,
col=which.color,
xlab="Intramodular Connectivity",
ylab="(Module Membership)^6")
which.color="blue";
restrictGenes=colorh1==which.color
verboseScatterplot(Alldegrees1$kWithin[ restrictGenes],
(datKME[restrictGenes, paste("MM.", which.color, sep="")])^6,
col=which.color,
xlab="Intramodular Connectivity",
ylab="(Module Membership)^6")
which.color="brown";
restrictGenes=colorh1==which.color
verboseScatterplot(Alldegrees1$kWithin[ restrictGenes],
(datKME[restrictGenes, paste("MM.", which.color, sep="")])^6,
col=which.color,
xlab="Intramodular Connectivity",
ylab="(Module Membership)^6")
which.color="green";
restrictGenes=colorh1==which.color
verboseScatterplot(Alldegrees1$kWithin[ restrictGenes],
(datKME[restrictGenes, paste("MM.", which.color, sep="")])^6,
col=which.color,
xlab="Intramodular Connectivity",
ylab="(Module Membership)^6")
NS1=networkScreening(y=y, datME=datME, datExpr=datExpr,
oddPower=3, blockSize=1000, minimumSampleSize=4,
addMEy=TRUE, removeDiag=FALSE, weightESy=0.5)
## Proportion of agreement between lists based on abs(Cor.Weighted) and abs(Cor.Standard):
## Top 10 list of genes: prop. agree = 0.5
## Top 20 list of genes: prop. agree = 0.45
## Top 50 list of genes: prop. agree = 0.62
## Top 100 list of genes: prop. agree = 0.68
## Top 200 list of genes: prop. agree = 0.71
## Top 500 list of genes: prop. agree = 0.8
## Top 1000 list of genes: prop. agree = 0.814
# network screening analysis
mean(NoiseGeneIndicator[rank(NS1$p.Weighted,ties.method="first")<=100])
## [1] 0.2
# standard analysis based on the correlation p-values (or Student T test)
mean(NoiseGeneIndicator[rank(NS1$p.Standard,ties.method="first")<=100])
## [1] 0.48
topNumbers=c(10,20,50,100)
for (i in c(1:length(topNumbers)) )
{
print(paste("Proportion of noise genes in the top", topNumbers[i], "list"))
WGCNApropNoise=mean(NoiseGeneIndicator[rank(NS1$p.Weighted,ties.method="first")<=topNumbers[i]])
StandardpropNoise=mean(NoiseGeneIndicator[rank(NS1$p.Standard,ties.method="first")<=topNumbers[i]])
print(paste("WGCNA, proportion of noise=", WGCNApropNoise,
", Standard, prop. noise=", StandardpropNoise))
if (WGCNApropNoise< StandardpropNoise) print("WGCNA wins")
if (WGCNApropNoise==StandardpropNoise) print("both methods tie")
if (WGCNApropNoise>StandardpropNoise) print("standard screening wins")
}
## [1] "Proportion of noise genes in the top 10 list"
## [1] "WGCNA, proportion of noise= 0 , Standard, prop. noise= 0.4"
## [1] "WGCNA wins"
## [1] "Proportion of noise genes in the top 20 list"
## [1] "WGCNA, proportion of noise= 0.1 , Standard, prop. noise= 0.4"
## [1] "WGCNA wins"
## [1] "Proportion of noise genes in the top 50 list"
## [1] "WGCNA, proportion of noise= 0.14 , Standard, prop. noise= 0.4"
## [1] "WGCNA wins"
## [1] "Proportion of noise genes in the top 100 list"
## [1] "WGCNA, proportion of noise= 0.2 , Standard, prop. noise= 0.48"
## [1] "WGCNA wins"
# rm(dissTOM); collectGarbage()
#Form a data frame containing standard and network screening results
CorPrediction1=data.frame(GS1,NS1$cor.Weighted)
cor.Weighted=NS1$cor.Weighted
# Plot the comparison
# sizeGrWindow(8, 6)
verboseScatterplot(cor.Weighted, GS1,
main="Network-based weighted correlation versus Pearson correlation\n",
col=truemodule, cex.main = 1.2)
abline(0,1)
set.seed(2)
nSamples2=2000
MEgreen=rnorm(nSamples2)
scaledy2=MEgreen*ESgreen+sqrt(1-ESgreen^2)*rnorm(nSamples2)
y2=ifelse( scaledy2>median(scaledy2),2,1)
MEturquoise= ESturquoise*scaledy2+sqrt(1-ESturquoise^2)*rnorm(nSamples2)
# we simulate a strong dependence between MEblue and MEturquoise
MEblue= .6*MEturquoise+ sqrt(1-.6^2) *rnorm(nSamples2)
MEbrown= ESbrown*scaledy2+sqrt(1-ESbrown^2)*rnorm(nSamples2)
MEyellow= ESyellow*scaledy2+sqrt(1-ESyellow^2)*rnorm(nSamples2)
# Put together a data frame of eigengenes
ModuleEigengeneNetwork2=data.frame(y=y2,MEturquoise,MEblue,MEbrown,MEgreen, MEyellow)
# Simulate the expression data
dat2=simulateDatExpr5Modules(MEturquoise=ModuleEigengeneNetwork2$MEturquoise,
MEblue=ModuleEigengeneNetwork2$MEblue,MEbrown=ModuleEigengeneNetwork2$MEbrown,
MEyellow=ModuleEigengeneNetwork2$MEyellow,
MEgreen=ModuleEigengeneNetwork2$MEgreen,simulateProportions=simulateProportions1,
nGenes=nGenes1)
# recall that this is the signed gene significance in the training data
GS1= as.numeric(cor(y, datExpr, use="p"))
# the following is the signed gene significance in the test data
GS2=as.numeric( cor(ModuleEigengeneNetwork2$y, dat2$datExpr, use="p"))
# sizeGrWindow(8,6)
par(mfrow=c(1,1))
verboseScatterplot(GS1,GS2,
main="Trait-based gene significance in test set vs. training set\n",
xlab = "Training set gene significance",
ylab = "Test set gene significance",
col=truemodule, cex.main = 1.4)
EvaluationGeneScreening1 = corPredictionSuccess(
corPrediction = CorPrediction1,
corTestSet=GS2,
topNumber=seq(from=20, to=500, length=30) )
par(mfrow=c(2,2))
listcomp = EvaluationGeneScreening1$meancorTestSetOverall
matplot(x = listcomp$topNumber,
y = listcomp[,-1],
main="Predicting positive and negative correlations",
ylab="mean cor, test data",
xlab="top number of genes in the training data")
listcomp= EvaluationGeneScreening1$meancorTestSetPositive
matplot(x = listcomp$topNumber,
y = listcomp[,-1],
main="Predicting positive correlations",
ylab="mean cor, test data",
xlab="top number of genes in the training data")
listcomp= EvaluationGeneScreening1$meancorTestSetNegative
matplot(x = listcomp$topNumber,
y = listcomp[,-1],
main = "Predicting negative correlations",
ylab = "mean cor, test data",
xlab = "top number of genes in the training data")
relativeCorPredictionSuccess(corPredictionNew = NS1$cor.Weighted,
corPredictionStandard = GS1,
corTestSet=GS2,
topNumber=c(10,20,50,100,200,500) )
## topNumber corPredictionNew.kruskalP
## 1 10 1.725158e-02
## 2 20 3.437721e-03
## 3 50 2.042847e-05
## 4 100 6.861281e-07
## 5 200 1.451270e-05
## 6 500 1.350232e-04
# Create a data frame holding the results of gene screening
GeneResultsNetworkScreening=data.frame(GeneName=row.names(NS1), NS1)
# Write the data frame into a file
# write.table(GeneResultsNetworkScreening, file="GeneResultsNetworkScreening.csv", row.names=F,sep=",")
# Output of eigengene information:
datMEy = data.frame(y, datME)
eigengeneSignificance = cor(datMEy, y);
eigengeneSignificance[1,1] = (1+max(eigengeneSignificance[-1, 1]))/2
eigengeneSignificance.pvalue = corPvalueStudent(eigengeneSignificance, nSamples = length(y))
namesME=names(datMEy)
# Form a summary data frame
out1=data.frame(t(data.frame(eigengeneSignificance,
eigengeneSignificance.pvalue, namesME, t(datMEy))))
# Set appropriate row names
dimnames(out1)[[1]][1]="EigengeneSignificance"
dimnames(out1)[[1]][2]="EigengeneSignificancePvalue"
dimnames(out1)[[1]][3]="ModuleEigengeneName"
dimnames(out1)[[1]][-c(1:3)]=dimnames(datExpr)[[1]]
# Write the data frame into a file
# write.table(out1, file="MEResultsNetworkScreening.csv", row.names=TRUE, col.names = TRUE, sep=",")
# Display the first few rows:
head(out1)
## y MEblue MEbrown MEgreen
## EigengeneSignificance 0.70970248 -0.10142300 -0.27512004 0.41940495
## EigengeneSignificancePvalue 7.909070e-09 4.834028e-01 5.315189e-02 2.431272e-03
## ModuleEigengeneName y MEblue MEbrown MEgreen
## Sample1 1.000000000 -0.020390790 0.069159851 -0.138749932
## Sample2 1.00000000 0.01062292 0.24403639 0.01513903
## Sample3 1.000000000 -0.109366048 0.232823387 -0.159918300
## MEgrey MEturquoise MEyellow
## EigengeneSignificance -0.17512633 0.05594979 0.09932761
## EigengeneSignificancePvalue 2.238187e-01 6.995553e-01 4.925316e-01
## ModuleEigengeneName MEgrey MEturquoise MEyellow
## Sample1 -0.066877071 -0.050387559 0.002085968
## Sample2 -0.01593587 0.02762906 0.03332384
## Sample3 0.071309662 -0.121035286 -0.003678822
# Write out gene information
GeneName=dimnames(datExpr)[[2]]
GeneSummary=data.frame(GeneName, truemodule, SignalGeneIndicator, NS1)
# write.table(GeneSummary, file="GeneSummaryTutorial.csv", row.names=F,sep=",")
# here we output the module eigengenes and trait y without eigengene significances
datTraits=data.frame(ArrayName, datMEy)
dimnames(datTraits)[[2]][2:length(namesME)]=paste("Trait",
dimnames(datTraits)[[2]][2:length(namesME)],
sep=".")
# write.table(datTraits, file="TraitsTutorial.csv", row.names=F,sep=",")
rm(datTraits)
# here we output the simulated gene expression data
MicroarrayData=data.frame(GeneName, t(datExpr))
names(MicroarrayData)[-1]=ArrayName
# write.table(MicroarrayData, file="MicroarrayDataTutorial.csv", row.names=F,sep=",")
rm(MicroarrayData)
# Perform network screening
NS1GS=networkScreeningGS(datExpr=datExpr, datME = datME, GS=GS1)
## Proportion of agreement between GS.Weighted and GS:
## Top 10 list of genes: prop. of agreement = 0.3
## Top 20 list of genes: prop. of agreement = 0.4
## Top 50 list of genes: prop. of agreement = 0.36
## Top 100 list of genes: prop. of agreement = 0.33
## Top 200 list of genes: prop. of agreement = 0.32
## Top 500 list of genes: prop. of agreement = 0.424
## Top 1000 list of genes: prop. of agreement = 0.558
# Organize its results for easier plotting
GSprediction1=data.frame(GS1,NS1GS$GS.Weighted)
GS.Weighted=NS1GS$GS.Weighted
# Plot a comparison between standard gene significance and network-weighted gene significance
# sizeGrWindow(8, 6)
par(mfrow=c(1,1))
verboseScatterplot(GS1, GS.Weighted,
main="Weighted gene significance vs. the standard GS\n",
col=truemodule)
abline(0,1)
EvaluationGeneScreeningGS = corPredictionSuccess(corPrediction=GSprediction1, corTestSet=GS2,
topNumber=seq(from=20, to=500, length=30) )
# sizeGrWindow(8, 6)
par(mfrow=c(2,2))
listcomp= EvaluationGeneScreeningGS$meancorTestSetOverall
matplot(x=listcomp$topNumber,
y=listcomp[,-1],
main="Predicting positive and negative correlations",
ylab="mean cor, test data",
xlab="top number of genes in the training data")
listcomp= EvaluationGeneScreeningGS$meancorTestSetPositive
matplot(x=listcomp$topNumber,
y=listcomp[,-1],
main="Predicting positive correlations",
ylab="mean cor, test data",
xlab="top number of genes in the training data")
listcomp= EvaluationGeneScreeningGS$meancorTestSetNegative
matplot(x=listcomp$topNumber,
y=listcomp[,-1],
main="Predicting negative correlations",
ylab="mean cor, test data",
xlab="top number of genes in the training data")
# collectGarbage()
# save.image("Simulated-Screening.RData")
# Load WGCNA package
library(WGCNA)
library(cluster)
# Load the previously saved data
# load("Simulated-RelatingToExt.RData");
# load("Simulated-Screening.RData")
cmd1=cmdscale(as.dist(dissTOM),2)
# sizeGrWindow(7, 6)
par(mfrow=c(1,1))
plot(cmd1, col=as.character(colorh1), main="MDS plot",
xlab="Scaling Dimension 1", ylab="Scaling Dimension 2")
power=6
color1=colorDynamicTOM
restGenes= (color1 != "grey")
diss1=1-TOMsimilarityFromExpr( datExpr[, restGenes], power = 6 )
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
hier1=hclust(as.dist(diss1), method="average" )
diag(diss1) = NA;
# sizeGrWindow(7,7)
TOMplot(diss1^4, hier1, as.character(color1[restGenes]),
main = "TOM heatmap plot, module genes" )
power=6
color1=colorDynamicTOM
restGenes= (color1 != "grey")
diss1=1-adjacency( datExpr[, restGenes], power = 6 )
hier1=hclust(as.dist(diss1), method="average" )
diag(diss1) = NA;
# sizeGrWindow(7,7)
TOMplot(diss1^4, hier1, as.character(color1[restGenes]),
main = "Adjacency heatmap plot, module genes" )
# sizeGrWindow(7,7)
topList=rank(NS1$p.Weighted,ties.method="first")<=30
gene.names= names(datExpr)[topList]
# The following shows the correlations between the top genes
plotNetworkHeatmap(datExpr, plotGenes = gene.names,
networkType="signed", useTOM=FALSE,
power=1, main="signed correlations")
# sizeGrWindow(7,7)
# The following shows the correlations between the top genes
plotNetworkHeatmap(datExpr, plotGenes = gene.names,
networkType="unsigned", useTOM=FALSE,
power=1, main="signed correlations")
# sizeGrWindow(7,7)
# The following shows the TOM heatmap in a signed network
plotNetworkHeatmap(datExpr, plotGenes = gene.names,
networkType="signed", useTOM=TRUE,
power=12, main="C. TOM in a signed network")
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
# The following shows the TOM heatmap in a unsigned network
plotNetworkHeatmap(datExpr, plotGenes = gene.names,
networkType="unsigned", useTOM=TRUE,
power=6, main="D. TOM in an unsigned network")
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.